# Imported Inequality 

# Code for replication of graphs (file 2 of 2)



# Employment v. investment ------------------------------------------------

## Figure: proportion of migrants by income percentile in the cross-section

# upload data on top shares
data_ts <- read_csv("output/ts_migrant_comb_18plus.csv")
#recode grouping variable
data_ts <- data_ts %>% 
  mutate(group = case_when(
    group == "all" ~ "all",
    group == "dummy_on" ~ "migrant",
    group == "dummy_off_noreplacem" ~ "brits_noreplace",
    group == "dummy_off_withreplacem" ~ "brits_withreplace",
    TRUE ~ NA_character_
  ))


# upload top share formatted
dd <- read_csv('output/topshares_newti.csv')

## prepare dataset

# list of income variables
varlist <- quos(topshare_ti, topshare_tii, topshare_tei, topshare_emp, topshare_self)
varlist_stub <- c('ti', 'inv', 'tei', 'emp', 'self')


# looping over income variables
dgraph <- map2_dfr(varlist, varlist_stub, function(income_var, stub){
  
  # calculate share of migrants in each share 
  dgraph <- map_dfr(c(1, 0.1, 0.01, 0.001)/100, function(x) {
    
    # proportion of income
    pi <- dd %>%
      filter(group == 'migrant' | group == 'all') %>%
      filter(ts == x) %>%
      mutate(inc_var = !!income_var) %>%  #### <----- choose income variable here ######
      select(inc_var, group, tax_year) %>%
      spread(key = 'group', value = 'inc_var', sep = '_') %>%
      mutate(prop_income = group_migrant) %>%
      select(tax_year, prop_income)
    
    # proportion of total count
    pp <- data_ts %>%
      filter(ts == x) %>%
      filter(group == 'all') %>%
      mutate(prop = migrants_count/topshare_count) %>%
      select(tax_year, prop)
    
    # data
    dgraph <- left_join(pi, pp, by = 'tax_year')
    
    # top share
    dgraph <- dgraph %>%
      mutate(ts = x)
    
    # return
    return(dgraph)
    
  })
  
  
  # top share as character
  dgraph <- dgraph %>%
    mutate(ts = case_when(ts == .01 ~ "Top 1",
                          ts == .001 ~ "Top 0.1",
                          ts == .0001 ~ "Top 0.01",
                          ts == .00001 ~ "Top 0.001"),
           income_var = stub)
  
  # return
  return(dgraph)

})  
  
  

# # # #

# order income categories
dgraph <- dgraph %>%
  mutate(income_cat = case_when(income_var == 'emp' ~ 'a',
                                income_var == 'self' ~ 'b',
                                income_var == 'tei' ~ 'c',
                                income_var == 'inv' ~ "d",
                                income_var == 'ti' ~ 'e',
                                TRUE ~ NA_character_))




# Fig 3a

# panel (a): Earnings vs Investment income

dgraph %>%
  filter(ts == 'Top 1') %>%
  filter(income_var == 'tei' | income_var == 'inv') %>%
  group_by(income_var) %>%
  mutate(prop_income = prop_income/prop_income[1]) %>%
  ggplot(., aes(tax_year, prop_income, group = income_cat, color = income_cat,
                shape = income_cat)) +
  geom_point() +
  geom_line() +
  scale_y_continuous(limits = c(0, 3.5)) +
  scale_shape_manual(values = c(15, 16),
                     labels = c('Earnings', 'Investment')) +
  scale_color_manual(values = c('navy', 'dodgerblue'),
                     labels = c('Earnings', 'Investment')) +
  xlab('Years') +
  ylab('Top Share, index (1997 = 1)') +
  scale_x_continuous(breaks = seq(1997, 2018, 3)) +
  my_theme +
  theme(legend.position = 'bottom',
        legend.title = element_blank()) +
  theme(text = element_text(size = 16))


# save graph
ggsave(filename = "graph_3a.pdf", path = outpath,  dpi = "retina",
       width = 16, height = 12, units = "cm")



#B&W version
dgraph %>%
  filter(ts == 'Top 1') %>%
  filter(income_var == 'tei' | income_var == 'inv') %>%
  group_by(income_var) %>%
  mutate(prop_income = prop_income/prop_income[1]) %>%
  ggplot(., aes(tax_year, prop_income, group = income_cat, color = income_cat,
                shape = income_cat)) +
  geom_point() +
  geom_line() +
  scale_y_continuous(limits = c(0, 3.5)) +
  scale_shape_manual(values = c(15, 16),
                     labels = c('Earnings', 'Investment')) +
  scale_color_manual(values = c('black', 'grey'),
                     labels = c('Earnings', 'Investment')) +
  xlab('Years') +
  ylab('Top Share, index (1997 = 1)') +
  scale_x_continuous(breaks = seq(1997, 2018, 3)) +
  my_theme +
  theme(legend.position = 'bottom',
        legend.title = element_blank()) +
  theme(text = element_text(size = 16))


# save graph
ggsave(filename = "graph_3a_bw.pdf", path = outpath,  dpi = "retina",
       width = 16, height = 12, units = "cm")



# Industry distribution ---------------------------------------------------

# upload industry distribution data
data <- csv("output/industry_distribution.csv")

#Top earners data (for top 1% total income)
data_ts <- read_csv("output/ts_migrant_comb_18plus.csv")
#recode grouping variable
data_ts <- data_ts %>% 
  mutate(group = case_when(
    group == "all" ~ "all",
    group == "dummy_on" ~ "migrant",
    group == "dummy_off_noreplacem" ~ "brits_noreplace",
    group == "dummy_off_withreplacem" ~ "brits_withreplace",
    TRUE ~ NA_character_
  ))

#Use weighted totals
data <- data %>% mutate(n = n_wt)


## add a leading zero to 4-digit industries

# find 4-digit industries
ind <- grep("^\\d{4}$", data$sic)

# add a leading zero
data$sic[ind] <- paste(0, data$sic[ind], sep = "")


## upload official names

# sic codes
#  csv from https://www.gov.uk/government/publications/standard-industrial-classification-library(kableExtra)of-economic-activities-sic
sic <- read_csv('external/SIC07_CH_condensed_list_en.csv')
names(sic) <- c('sic', 'name')


# # # #

# total count by industry 
dtot <- data %>%
  group_by(sic) %>%
  summarise(tot = sum(n))

# merge back
data <- data %>%
  left_join(dtot, by = 'sic')

# proportion of migrants
data <- data %>%
  mutate(prop = n/tot)

# add names
data <- data %>%
  left_join(sic, by = 'sic')

# total count by group
tt <- data %>%
  group_by(migrant_comb) %>%
  summarise(group_tot = sum(n))

# merge back
data <- data %>%
  left_join(tt, by = 'migrant_comb')

# share by industry
data <- data %>%
  mutate(prop3 = n/group_tot)

# calculate relative income
data <- data %>%
  arrange(sic, migrant_comb) %>%
  group_by(sic) %>%
  mutate(income_ratio = ti/ti[1]) %>% 
  ungroup()


# table
tab <- data %>%
  filter(migrant_comb == 1) %>%
  filter(!sic == 'NA') %>%
  arrange(desc(prop3))

# rank according proportion of migrants within each industry
tab <- tab %>%
  arrange(desc(prop)) %>%
  mutate(rank1 = 1,
         rank1 = cumsum(rank1))

# adjust columns
tab <- tab %>%
  arrange(desc(prop3)) %>%
  mutate(rank = 1,
         rank = cumsum(rank),
         ti_round = floor(ti),
         emp_round = floor(emp_inc),
         prop3_100 = round(prop3*100, 1),
         prop_100 = round(prop*100, 1),
         income_ratio = round(income_ratio, 2)) %>%
  select(rank, name, prop3_100, ti_round, rank1, prop_100, sic, n, income_ratio, tot) 



#Top share income 2017:
data_ts <- data_ts %>% 
  filter(ts == 0.01 & group == "all" & tax_year == 2017) %>% 
  mutate(top1_inc = n * ti)
total_top1_ti <- data_ts$top1_inc


# compute share of total Top 1 income
tab <- tab %>%
  mutate(share_ti = 100*ti_round*n/total_top1_ti,
         share_ti = format(share_ti, nsmall = 2 #, digits = 0) digits argument must be strictly positive?
         ))

#Apply filter of total industry emp > 1000 to both tables
tab <- tab %>%
  filter(tot >= 1000)

# rank by prop 3
tab1 <- tab %>%
  .[1:10, ]


# top 10 names - to make it print nicely
#! This needs to be manually updated
tab1$name
new_names <- c('Banks', 'Hospitals', 'Support to financial services', 'Management consulting', 
               'Medical practice', 'Fund managers', 'Business administration', 'Head offices', 
               'IT consultancy', 'Solicitors')

#Check
bind_cols(tab1$name, new_names)
tab1$name <- new_names

# add SIC code into text name
tab1$name <- paste(tab1$name, " (", tab1$sic, ")", sep = "")

# drop sic and ncolumn
tab1$sic <- NULL
tab1$n <- NULL

# income with thousand separator
tab1 <- tab1 %>%
  mutate(ti_round = as.character(ti_round),
         ti_round = paste(str_extract(ti_round, "^\\d{3}"),
                          ",",
                          str_extract(ti_round, "\\d{3}$"), sep = ""
                          ))

# reorder
tab1 <- tab1 %>%
  select(rank, name, prop3_100, ti_round, prop_100, rank1, share_ti, income_ratio)

# kable latex (FOR THE PAPER. THIS IS TABLE 1A)
tab_latex <- tab1 %>%
  select(-share_ti) %>%
  kable(format = 'latex', booktabs = T,
        align = 'rlrrrrr',
        col.names = c('', "Industry (SIC)", 'Share of Top 1\\% Migrants',
                      'Average Income (\\pounds)', 'Share', 'Rank',
                      'Migrant Premium'),
        escape = F) %>%
  column_spec(column = 2, width = "17em") %>%
  column_spec(column = 3, width = "6em") %>%
  column_spec(column = 4, width = "6em")  %>%
  column_spec(column = 5, width = "5em") %>%
  column_spec(column = 6, width = "5em") %>%
  column_spec(column = 7, width = "5em") %>%
  add_header_above(c('', '', '', '', "Industry Dependency Ratio" = 2, ''))


# save tex file
write_file(tab_latex, "figs/industry_table.tex")


## reverse ranking

# tab
tab2 <- tab %>%
  arrange(desc(prop_100)) %>%
  select(rank1, name, prop_100, ti_round, income_ratio, rank, prop3_100, sic, n) %>%
  .[1:10, ]


# new names
tab2$name
#! This needs to be manually updated
new_names <- c('Oil refining', 'Web portals', 'Dealers', 'Financial management', 'Banks',
               'Other professionals', 'News agencies', 'Fund managers', 'Hospitals', 
               'Support to financial services')
#Check
bind_cols(tab2$name, new_names)
tab2$name <- new_names

# add SIC code into text name
tab2$name <- paste(tab2$name, " (", tab2$sic, ")", sep = "")

# drop sic  and ncolumn
tab2$sic <- NULL
tab2$n <- NULL

# income with thousand separator
tab2 <- tab2 %>%
  mutate(ti_round = as.character(ti_round),
         ti_round = paste(str_extract(ti_round, "^\\d{3}"),
                          ",",
                          str_extract(ti_round, "\\d{3}$"), sep = ""))

# rename columns
names(tab2) <- c('',
                 "Industry (SIC)", 
                 'Share of Top 1pc Workers', 
                 'Average Total Income',
                 'Migrant Premium',
                 'Rank', 'Share')



# kable latex (TABLE 1B)
tab_latex <- tab2 %>%
  #select(-share_ti) %>%
  kable(format = 'latex', booktabs = T,
        align = 'rlrrrrr',
        col.names = c('', "Industry (SIC)", 'Share of Top 1\\% Workers',
                      'Average Income (\\pounds)', 'Migrant Premium', 'Share', 'Rank'
        ),
        escape = F) %>%
  column_spec(column = 2, width = "17em") %>%
  column_spec(column = 3, width = "6em") %>%
  column_spec(column = 4, width = "6em") %>%
  column_spec(column = 5, width = "5em") %>%
  column_spec(column = 6, width = "5em") %>%
  column_spec(column = 7, width = "5em") %>%
  add_header_above(c('', '', '', '', "Industry Dependency Ratio" = 2, ''))

# save tex file
write_file(tab_latex, "figs/industry_table2.tex")



# Year of arrival ---------------------------------------------------------


# upload YOA distribution
data <- read_csv('output/YOA_allyears.csv')

data <- data %>%
  filter(base_year == 2017) %>%
  filter(ts == .01)

#Fig 3b 
data %>%
  filter(base_year == 2017) %>%
  filter(ts == .01) %>%
  add_row(arrival_grouped_max = 2017, group_persist = 'stayer', prop = NA) %>%
  ggplot(., aes(arrival_grouped_max, prop, fill = group_persist)) +
  geom_col(position = 'dodge') +
  scale_fill_manual(values = c('dodgerblue', 'navy'),
                    labels = c('Joiner', 'Stayer')) +
  scale_y_continuous(breaks = seq(0, .1, .02)) +
  xlab('Arrival Year') +
  ylab('Proportion') +
  my_theme +
  theme(legend.position = 'bottom',
        legend.title = element_blank()) +
  theme(text = element_text(size = 16))

# save
ggsave(filename = 'graph_3b.pdf', path = outpath, dpi = 'retina',
       width = 16, height = 12, units = 'cm')

#B&W version
data %>%
  filter(base_year == 2017) %>%
  filter(ts == .01) %>%
  add_row(arrival_grouped_max = 2017, group_persist = 'stayer', prop = NA) %>%
  ggplot(., aes(arrival_grouped_max, prop, fill = group_persist)) +
  geom_col(position = 'dodge') +
  scale_fill_manual(values = c('grey', 'black'),
                    labels = c('Joiner', 'Stayer')) +
  scale_y_continuous(breaks = seq(0, .1, .02)) +
  xlab('Arrival Year') +
  ylab('Proportion') +
  my_theme +
  theme(legend.position = 'bottom',
        legend.title = element_blank()) +
  theme(text = element_text(size = 16))

# save
ggsave(filename = 'graph_3b_bw.pdf', path = outpath, dpi = 'retina',
       width = 16, height = 12, units = 'cm')


# Age distribution --------------------------------------------------------


# upload
data <- read_csv('output/age_joiner_stayer.csv')

# pick year
yy <- 2017

# filter base year
data <- data %>%
  filter(base_year == yy) %>%
  filter(ts == .01)



# Fig 3d, A6b, A6a --------------------------------------------------------


#map(c('all', 'joiner', 'stayer'), function(x){
for (x in c('all', 'joiner', 'stayer')) {  
  #respectively, figA6b, fig3d, figA6a
  
    
  data %>%
    filter(group_persist == x) %>%
    ggplot(., aes(age_grouped_max, prop, fill = as.factor(migrant_comb))) +
    geom_col(position = 'dodge', alpha = .8) +
    scale_fill_manual(values = c('dodgerblue', 'maroon'),
                      labels = c('Natives', 'Migrants')) +
      scale_x_continuous(breaks = seq(20, 80, 10),
                         labels = c('20', '30', '40', '50', '60', '70', '71-80')) +
    scale_y_continuous(limits = c(0, 0.05)) + 
    xlab('Age') +
    ylab('Proportion') +
    my_theme +
    theme(legend.position = 'bottom',
          legend.title = element_blank()) +
      theme(text = element_text(size = 16))
  
    # filename
    file_name <- paste('age_distribution_', x, '.pdf', sep = "")
    
    # save
    ggsave(filename = file_name, path = outpath, dpi = 'retina',
         width = 16, height = 12, units = 'cm')

}

#B&W versions
for (x in c('all', 'joiner', 'stayer')) {  

  data %>%
    filter(group_persist == x) %>%
    ggplot(., aes(age_grouped_max, prop, fill = as.factor(migrant_comb))) +
    geom_col(position = 'dodge', alpha = .8) +
    scale_fill_manual(values = c('black', 'grey'),
                      labels = c('Natives', 'Migrants')) +
    scale_x_continuous(breaks = seq(20, 80, 10),
                       labels = c('20', '30', '40', '50', '60', '70', '71-80')) +
    scale_y_continuous(limits = c(0, 0.05)) + 
    xlab('Age') +
    ylab('Proportion') +
    my_theme +
    theme(legend.position = 'bottom',
          legend.title = element_blank()) +
    theme(text = element_text(size = 16))
  
  # filename
  file_name <- paste('age_distribution_', x, '_bw.pdf', sep = "")
  
  # save
  ggsave(filename = file_name, path = outpath, dpi = 'retina',
         width = 16, height = 12, units = 'cm')
  
}


# Mobility ----------------------------------------------------------------

# upload data
data <- read_csv('output/mobility.csv')

# bar chart with all data
dgraph <- data %>%
  gather(key = percentile, value = count, p99_100, p98_99, 
         p95_98, p90_95, p0_90, nid) %>%
  mutate(xlabel = if_else(migrant == "migrant", "Migrants", "Natives"),
         xlabel2 = paste(xlabel, " ", year_before, "y", sep = ""),
         xlabel3 = case_when(xlabel2 == "Migrants 1y" ~ "a",
                             xlabel2 == "Natives 1y" ~ "b",
                             xlabel2 == "Migrants 5y" ~ "c",
                             xlabel2 == "Natives 5y" ~ "d",
                             xlabel2 == "Migrants 10y" ~ "e",
                             xlabel2 == "Natives 10y" ~ "f"))

# graph
dgraph %>%
  mutate(xlabel4 = factor(xlabel2, levels = c("Migrants 1y", "Natives 1y",
                                              "Migrants 5y", "Natives 5y",
                                              "Migrants 10y", "Natives 10y"))) %>%
  ggplot(., aes(x = xlabel4, y = count, fill = percentile)) +
  geom_bar(position = "fill", stat = "identity", width = .8) +
  scale_fill_manual(
    values = c("grey", "grey30", "black", "lightblue", "royalblue", "navy"),
    labels = c("Not in data", "P0-P90", "P90-P95", "P95-P98", "P98-P99", "Top 1"),
    name = "Previous Location"
  ) +
  xlab("") +
  ylab("Groupwise Proportion") +
  my_theme +
  theme(legend.position = 'right')

ggsave(filename = "graph_3c.pdf", path = outpath, dpi = "retina",
       width = 16, height = 12, units = "cm")

#B&W version (greyscale)
totals <- dgraph %>%
  group_by(xlabel2) %>% 
  summarise(total = sum(count)) 

dgraph %>%
  mutate(
    xlabel4 = factor(xlabel2, levels = c("Migrants 1y", "Natives 1y",
                                         "Migrants 5y", "Natives 5y",
                                         "Migrants 10y", "Natives 10y")),
    `Previous Location` = case_when(
      percentile == "nid" ~ "Not in data",
      percentile == "p0_90" ~ "P0-P90",
      percentile == "p90_95" ~ "P90-P95",
      percentile == "p95_98" ~ "P95-P98",
      percentile == "p98_99" ~ "P98-P99",
      percentile == "p99_100" ~ "Top 1",
      TRUE ~ NA_character_
    ),
    timeframe = case_when(
      xlabel4 %in% c("Migrants 1y", "Natives 1y") ~ "1yr prev",
      xlabel4 %in% c("Migrants 5y", "Natives 5y") ~ "5yrs prev",
      xlabel4 %in% c("Migrants 10y", "Natives 10y") ~ "10yrs prev",
      TRUE ~ NA_character_
    )
  ) %>%
  left_join(totals) %>%
  mutate(share = count / total) %>%
  ggplot(., aes(x = xlabel4, y = share, fill = `Previous Location`)) +
  geom_bar(position = "stack", stat = "identity", width = .8, color = "black") +
  xlab("") +
  ylab("Groupwise Proportion") +
  scale_fill_manual(values = c("white", "grey10", "grey50", "grey30", "grey80", "black")) +
  scale_x_discrete(labels = c(
    "Migrants 1y" = "Native\n1yr prev",
    "Natives 1y" = "Migrant\n1yr prev",
    "Migrants 5y" = "Native\n5yrs prev",
    "Natives 5y" = "Migrant\n5yrs prev",
    "Migrants 10y" = "Native\n10yrs prev",
    "Natives 10y" = "Migrant\n10yrs prev"
  )) +
  my_theme +
  theme(legend.position = 'right', legend.key.size = unit(1, 'cm'))

ggsave(filename = "graph_3c_bw.pdf", path = outpath, dpi = "retina",
       width = 16, height = 12, units = "cm")
